#!/usr/bin/python3
# -*- coding: utf-8 -*-
#author: 王树根
#date: 2016年3月15日 下午8:16:42
from scrapy.selector.unified import Selector
from scrapy.spiders import Spider
from globaltimes.items import GlobalTimesItem
import re


class GlobalTimesSpider(Spider):
    """Spider that scrapes the Global Times news index page.

    Crawls the "indexchild" listing and yields one :class:`GlobalTimesItem`
    per news entry, with category, headline, link, abstract and the numeric
    news id parsed out of the article URL.
    """

    name = "globaltimes"

    # Must be a list: Scrapy's OffsiteMiddleware iterates allowed_domains,
    # so a bare string would be treated as a sequence of single characters.
    allowed_domains = ["globaltimes.cn"]

    # A list (not a set) keeps the Scrapy convention and a deterministic order.
    start_urls = [
        "http://www.globaltimes.cn/includes/indexchild.html",
    ]

    # Compiled once at class definition time instead of on every parse() call;
    # captures the first run of digits (the news id) in an article URL.
    _news_id_re = re.compile(r'.*?(\d+).*')

    def parse(self, response):
        """Parse the index page and yield one item per news row.

        :param response: the downloaded index-page response.
        :yields: ``GlobalTimesItem`` with ``category``, ``headline``,
            ``news_link``, ``abstract`` and (when the link contains digits)
            ``news_id`` fields. Fields whose node is missing are ``None``.
        """
        html_sel = Selector(response)

        news_nodes = html_sel.xpath(u'//div[@id="more-item"]//div[@class="row-content"]')
        for news_node in news_nodes:
            item = GlobalTimesItem()
            # extract_first() returns None on a missing node instead of the
            # IndexError that extract()[0] raises, so one malformed row no
            # longer aborts the whole page.
            item['category'] = news_node.xpath(u'a[@class="channel"]/text()').extract_first()
            item['headline'] = news_node.xpath(u'a[last()]/text()').extract_first()
            item['news_link'] = news_node.xpath(u'a[last()]/@href').extract_first()
            item['abstract'] = news_node.xpath(u'p/text()').extract_first()

            # news_link may be None (guarded above via truthiness) or may lack
            # digits, in which case news_id is simply left unset.
            if item['news_link']:
                news_id_match_obj = self._news_id_re.match(item['news_link'])
                if news_id_match_obj:
                    item['news_id'] = news_id_match_obj.group(1)

            yield item